package com.example.cadmin.utils.PDF;

import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Command-line demo: extracts the text of a PDF with PDFBox, removes all
 * whitespace from it, and segments the result into terms with Ansj's
 * {@code ToAnalysis}. Each intermediate result is printed to standard output.
 */
public class readPDF {
    /** PDF that is read when no path is supplied on the command line. */
    private static final String DEFAULT_PDF_PATH =
            "C:\\Users\\Administrator\\Desktop\\20240422150150971.pdf";

    /**
     * Matches runs of whitespace. Compiled once; removing every match yields
     * the same string as the earlier {@code \s*|\t|\r|\n} pattern without
     * producing an empty match at every character position.
     */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Reads a PDF, prints its raw text, the whitespace-free text, and the
     * list of segmented terms.
     *
     * @param args optional; {@code args[0]} is the path of the PDF to read.
     *             When absent, {@link #DEFAULT_PDF_PATH} is used.
     */
    public static void main(String[] args) {
        String pdfFilePath = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;

        // try-with-resources closes the document even when text extraction
        // or segmentation throws.
        try (PDDocument document = PDDocument.load(new File(pdfFilePath))) {
            PDFTextStripper pdfStripper = new PDFTextStripper();

            // Extract the raw text of the document.
            String text = pdfStripper.getText(document);
            System.out.println(text);

            // Drop every whitespace character before segmentation.
            String compactText = WHITESPACE.matcher(text).replaceAll("");
            System.out.println(compactText);

            // Segment the compacted text into terms.
            List<Term> terms = ToAnalysis.parse(compactText).getTerms();
            System.out.println(terms);
        } catch (IOException e) {
            System.err.println("Failed to read PDF: " + pdfFilePath);
            e.printStackTrace();
        }
    }
}